scRNAseq: Bioconductor
DropletUtils: Bioconductor
scater: Bioconductor, GitHub, Paper
Demo Dataset: BachMammaryData from Differentiation dynamics of mammary epithelial cells revealed by single-cell RNA sequencing Nat Commun. 2017;8(1):2128.
License: GPL-3.0
Rcd /ngs/scRNA-seq-Analysis-Demo
R
suppressPackageStartupMessages({
library(scRNAseq)
library(scater)
library(ggbeeswarm) # geom_quasirandom
})c30 <- c("#1C86EE", #1 dodgerblue2
"#FF0000", #2 red1
"#008B00", #3 green4
"#FF7F00", #4 DarkOrange1
"#00FF00", #5 green1
"#A020F0", #6 purple
"#0000FF", #7 blue1
"#FF1493", #8 DeepPink1
"#8B4500", #9 DarkOrange4
"#000000", #10 black
"#FFD700", #11 gold1
"#00CED1", #12 DarkTurquoise
"#68228B", #13 DarkOrchid4
"#FF83FA", #14 orchid1
"#B3B3B3", #15 gray70
"#B03060", #16 maroon
"#7CCD7C", #17 PaleGreen3
"#333333", #18 gray20
"#D8BFD8", #19 thistle
"#FFC125", #20 goldenrod1
"#EEE685", #21 khaki2
"#7EC0EE", #22 SkyBlue2
"#36648B", #23 SteelBlue4
"#54FF9F", #24 SeaGreen1
"#8B8B00", #25 yellow4
"#CDCD00", #26 yellow3
"#F08080", #27 LightCoral
"#A52A2A", #28 brown
"#00008B", #29 blue4
"#CD2626" #30 firebrick3
)
pie(rep(1,30), col = c30, radius = 1)# Choosing colours for samples, n = 8
sample_col <- c30[c(1:8)]
# Choosing colours for clutsers, n = 15
cluster_col <- c30[c(1:15)]
# Choosing colours for Venn diagram, n = 3
circle.col <- c30[c(23,24,27)]In this example, we will use the example data set from the scRNAseq Bioconductor package. It contains expression matrices for several public scRNA-seq datasets in the form of SingleCellExperiment objects. The BachMammaryData function will download and import the mouse mammary gland single-cell RNA-seq data obtained with the 10x Genomics Chromium platform from Bach et al. (2017). The object contains 25,806 barcodes, cell annotations that includes the sample ID and condition, and the gene annotations that includes the Ensembl gene ID and gene symbol.
sce <- BachMammaryData()
sce## class: SingleCellExperiment
## dim: 27998 25806
## metadata(0):
## assays(1): counts
## rownames: NULL
## rowData names(2): Ensembl Symbol
## colnames: NULL
## colData names(3): Barcode Sample Condition
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):
# Print the number of genes and cells in the object
paste0("Number of genes: ", nrow(sce))## [1] "Number of genes: 27998"
paste0("Number of cells: ", ncol(sce))## [1] "Number of cells: 25806"
# Cell info
str(colData(sce))## Formal class 'DFrame' [package "S4Vectors"] with 6 slots
## ..@ rownames : NULL
## ..@ nrows : int 25806
## ..@ listData :List of 3
## .. ..$ Barcode : chr [1:25806] "AAACCTGAGGCCATAG-1" "AAACCTGCATCGGGTC-1" "AAACGGGTCAAACGGG-1" "AAAGATGAGATAGCAT-1" ...
## .. ..$ Sample : chr [1:25806] "NP_1" "NP_1" "NP_1" "NP_1" ...
## .. ..$ Condition: Named chr [1:25806] "Nulliparous" "Nulliparous" "Nulliparous" "Nulliparous" ...
## .. .. ..- attr(*, "names")= chr [1:25806] "NP" "NP" "NP" "NP" ...
## ..@ elementType : chr "ANY"
## ..@ elementMetadata: NULL
## ..@ metadata : list()
# Gene info
str(rowData(sce))## Formal class 'DFrame' [package "S4Vectors"] with 6 slots
## ..@ rownames : NULL
## ..@ nrows : int 27998
## ..@ listData :List of 2
## .. ..$ Ensembl: chr [1:27998] "ENSMUSG00000051951" "ENSMUSG00000089699" "ENSMUSG00000102343" "ENSMUSG00000025900" ...
## .. ..$ Symbol : chr [1:27998] "Xkr4" "Gm1992" "Gm37381" "Rp1" ...
## ..@ elementType : chr "ANY"
## ..@ elementMetadata: NULL
## ..@ metadata : list()
Note: Skip if using the
BachMammaryDatadataset.
# Define sample ID
sample_id <- c("Sample1", "Sample2", "Sample3", "Sample4")To import scRNA-seq data generated by Cell Ranger, we can using the read10xCounts function from the DropletUtils package, which will produce a SingleCellExperiment object containing count data for each gene (row) and cell (column) across all samples.
Edit the script below to point cr_mat_path to the directory containing CellRanger output files: barcodes.tsv.gz, features.tsv.gz, and matrix.mtx.gz.
# Define data location
cr_mat_path <- "path-to-cellranger-output-folder" # a folder
# Check if these files exist: barcodes.tsv.gz, features.tsv.gz, matrix.mtx.gz
check_matrix_input <- function(cr_mat_path) {
error <- FALSE
if(! file.exists(paste0(cr_mat_path,"/barcodes.tsv.gz"))) {
error <- TRUE
print("'barcodes.tsv.gz' not found")
}
if(! file.exists(paste0(cr_mat_path,"/features.tsv.gz"))) {
error <- TRUE
print("'features.tsv.gz' not found")
}
if(! file.exists(paste0(cr_mat_path,"/matrix.mtx.gz"))) {
error <- TRUE
print("'matrix.mtx.gz' not found")
}
if(isTRUE(error)) {
stop("Stopped!")
} else {
print("All files found.")
}
}
check_matrix_input(cr_mat_path)
# Import data
sce <- DropletUtils::read10xCounts(cr_mat_path, sample.names = sample_id)
sce
# Print the number of genes and cells in the object
paste0("Number of genes: ", nrow(sce))
paste0("Number of cells: ", ncol(sce))Edit the script below to point cr_h5_path to the path of the filtered_feature_bc_matrix.h5 file.
# Define data location
cr_h5_file <- "path-to-cellranger-output-folder/filtered_feature_bc_matrix.h5" # a h5 file
# Check if this files exist
file.exists(cr_h5_file)
# Import data
sce <- DropletUtils::read10xCounts(cr_h5_file, sample.names = sample_id)
sce
# Print the number of genes and cells in the object
paste0("Number of genes: ", nrow(sce))
paste0("Number of cells: ", ncol(sce))Note: Skip if using the
BachMammaryDatadataset.
colnames(sce) <- colData(sce)$Barcode
rownames(sce) <- rowData(sce)$Symbol
# Cell info
str(colData(sce))
# Gene info
str(rowData(sce))We use grep to perform pattern matching to look for genes that are from the mitochondrial genome (chrM). The regular expression "^mt-|^MT-" will work for both human (MT-) and mouse (mt-) mitochondrial genomes. The ^ anchor is to ensure the pattern is matched to the beginning of the gene symbol.
# The subset feature can be supplied as character vector of feature names, a logical vector,
# or a numeric vector of indices
#is.mito <- grepl("^mt-|^MT-", rowData(sce)$Symbol) # a logical vector
is.mito <- grep("^mt-|^MT-", rowData(sce)$Symbol) # numeric vector of indices
# Print matched genes
rowData(sce)$Symbol[is.mito]## [1] "mt-Nd1" "mt-Nd2" "mt-Co1" "mt-Co2" "mt-Atp8" "mt-Atp6" "mt-Co3" "mt-Nd3"
## [9] "mt-Nd4l" "mt-Nd4" "mt-Nd5" "mt-Nd6" "mt-Cytb"
With an older version of scater, we would use the calculateQCMetrics function to add QC metrics, but it is now deprecated. We will instead use addPerCellQC and addPerFeatureQC and add additional metrics to colData and rowData instead.
Note: A pseudocount of 1 is added to avoid undefined values after the log-transformation.
# "sum" - sum of counts for the cell (library size)
# "detected" - number of genes for the cell that have counts above the detection limit (default 0)
sce <- addPerCellQC(sce, list(MT = is.mito))
# Add additional stats to per-cell QC
pseudocount = 1
colData(sce)$log10_sum <- log10(colData(sce)$sum + pseudocount)
colData(sce)$log10_detected <- log10(colData(sce)$detected + pseudocount)
colData(sce)$log10_genes_per_umi <- colData(sce)$log10_detected / colData(sce)$log10_sum# "mean" - mean counts for each gene across all cells
# "detected" - percentage of expressing cells, i.e.cells with non-zero counts for each gene
sce <- addPerFeatureQC(sce)sce## class: SingleCellExperiment
## dim: 27998 25806
## metadata(0):
## assays(1): counts
## rownames: NULL
## rowData names(4): Ensembl Symbol mean detected
## colnames: NULL
## colData names(16): Barcode Sample ... log10_detected log10_genes_per_umi
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):
# Cell data
names(colData(sce))## [1] "Barcode" "Sample" "Condition"
## [4] "sum" "detected" "percent_top_50"
## [7] "percent_top_100" "percent_top_200" "percent_top_500"
## [10] "subsets_MT_sum" "subsets_MT_detected" "subsets_MT_percent"
## [13] "total" "log10_sum" "log10_detected"
## [16] "log10_genes_per_umi"
colData(sce)## DataFrame with 25806 rows and 16 columns
## Barcode Sample Condition sum detected percent_top_50
## <character> <character> <character> <numeric> <integer> <numeric>
## 1 AAACCTGAGGCCATAG-1 NP_1 Nulliparous 13152 3729 23.0383211678832
## 2 AAACCTGCATCGGGTC-1 NP_1 Nulliparous 12147 3301 27.6446859306825
## 3 AAACGGGTCAAACGGG-1 NP_1 Nulliparous 12095 3227 28.8879702356346
## 4 AAAGATGAGATAGCAT-1 NP_1 Nulliparous 12879 3771 21.3215311747806
## 5 AAAGATGAGATATGCA-1 NP_1 Nulliparous 12312 3627 26.9818063677713
## ... ... ... ... ... ... ...
## 25802 TTTGTCAGTGATAAGT-1 PI_2 Post-involution 13343 3386 28.9140373229409
## 25803 TTTGTCAGTGTAACGG-1 PI_2 Post-involution 3073 1345 29.6452977546372
## 25804 TTTGTCAGTGTTTGGT-1 PI_2 Post-involution 10834 3009 30.0812257707218
## 25805 TTTGTCATCTTGACGA-1 PI_2 Post-involution 6820 2293 28.2991202346041
## 25806 TTTGTCATCTTTAGGG-1 PI_2 Post-involution 4747 1950 31.303981461976
## percent_top_100 percent_top_200 percent_top_500 subsets_MT_sum
## <numeric> <numeric> <numeric> <numeric>
## 1 34.7475669099757 45.7877128953771 60.4090632603406 112
## 2 39.9110891578167 51.9305178233309 65.7034658763481 102
## 3 42.6953286482017 54.2290202563043 67.6395204630012 113
## 4 32.5335817998292 44.133861324637 58.9642052954422 105
## 5 38.9538661468486 50.219298245614 63.1172839506173 141
## ... ... ... ... ...
## 25802 40.8978490594319 52.1022258862325 66.1395488271003 76
## 25803 41.7832736739343 54.6046208916368 72.50244061178 7
## 25804 42.9204356654975 54.2274321580211 67.7496769429574 87
## 25805 39.3988269794721 50.7624633431085 66.8181818181818 41
## 25806 42.3846639983147 53.1072256161786 67.7269854645039 51
## subsets_MT_detected subsets_MT_percent total log10_sum log10_detected
## <integer> <numeric> <numeric> <numeric> <numeric>
## 1 7 0.851581508515815 13152 4.11902482011478 3.57170883180869
## 2 6 0.839713509508521 12147 4.08450478324623 3.51877706892677
## 3 6 0.934270359652749 12095 4.08264177815713 3.50893352605003
## 4 5 0.815280689494526 12879 4.10991586302379 3.57657168406529
## 5 6 1.14522417153996 12312 4.09036387947172 3.55966727838806
## ... ... ... ... ... ...
## 25802 7 0.569587049389193 13343 4.12528603329366 3.52981519664463
## 25803 2 0.227790432801822 3073 3.48770386316373 3.12904505988796
## 25804 7 0.803027505999631 10834 4.03482891565584 3.47856649559384
## 25805 5 0.601173020527859 6820 3.83384804953115 3.36059341356525
## 25806 6 1.07436275542448 4747 3.67651071028255 3.29025726939452
## log10_genes_per_umi
## <numeric>
## 1 0.867124862750683
## 2 0.861494172649779
## 3 0.85947622072146
## 4 0.87022990330364
## 5 0.870256872806093
## ... ...
## 25802 0.855653442732648
## 25803 0.897164777358584
## 25804 0.862134818677541
## 25805 0.876558843790438
## 25806 0.894940210616617
# Gene data
names(rowData(sce))## [1] "Ensembl" "Symbol" "mean" "detected"
rowData(sce)## DataFrame with 27998 rows and 4 columns
## Ensembl Symbol mean detected
## <character> <character> <numeric> <numeric>
## 1 ENSMUSG00000051951 Xkr4 0.0215066263659614 1.98403472060761
## 2 ENSMUSG00000089699 Gm1992 0.016662791598853 1.53840192203364
## 3 ENSMUSG00000102343 Gm37381 0 0
## 4 ENSMUSG00000025900 Rp1 0.000542509493916143 0.0542509493916144
## 5 ENSMUSG00000109048 Rp1 0 0
## ... ... ... ... ...
## 27994 ENSMUSG00000079808 AC168977.1 0.00213128729752771 0.205378594125397
## 27995 ENSMUSG00000095041 PISD 0.706347361078819 38.8436797643959
## 27996 ENSMUSG00000063897 DHRSX 0.34290475083314 25.668449197861
## 27997 ENSMUSG00000096730 Vmn2r122 0 0
## 27998 ENSMUSG00000095742 CAAA01147332.1 0.0175928078741378 1.73990544834535
In most cases, We can assume most of the cells are high quality and use the median absolute deviation (MAD) from the median approach to identify cells that are outliers, presumably representing low-quality cells. The threshold used here to determined if a cell is an outlier is if it is more than 3 MADs from the median, under a normal distribution this cutoff will retain 99% of cells. The log-transformed values will be applied to the input values (with log = TRUE) as this improves resolution at small values for distribution exhibit a heavy right tail, and avoid inflation of the MAD that might compromise outlier detection on the left tail.
Note: One should be aware of factors that could affect the distribution, for example certain cells can have a high metabolic rate and thus higher mitochondrial gene expression, or certain cells can express very few genes. In such cases, one might need to use fixed thresholds to filter low-quality cells.
qc.lib <- isOutlier(colData(sce)$sum, nmads = 3, log = TRUE, type = "lower")
qc.expr <- isOutlier(colData(sce)$detected, nmads = 3, log = TRUE, type = "lower")
qc.mito <- isOutlier(colData(sce)$subsets_MT_percent, nmads = 3, type = "higher")View the filter thresholds and determine if they are appropriate.
attr(qc.lib, "thresholds") # values lower than the "lower" threshold would be filtered## lower higher
## 924.5199 Inf
attr(qc.expr, "thresholds") # values lower than the "lower" threshold would be filtered## lower higher
## 582.1861 Inf
attr(qc.mito, "thresholds") # values higher than the "higher" threshold would be filtered## lower higher
## -Inf 1.632992
Note: The qc.mito threshold is very low, i.e. it will remove cells with mitochondrial proportions greater than ~1.63%. Usually cells with less than 10% are considered good quality and more than 20% are of poor quality. In this case, we will manually set a fixed threshold of 10%.
qc.mito.threshold = 10 # set at 10%
qc.mito <- colData(sce)$subsets_MT_percent > qc.mito.threshold
attr(qc.mito, "thresholds")["lower"] <- -Inf
attr(qc.mito, "thresholds")["higher"] <- qc.mito.threshold
attr(qc.mito, "thresholds")## lower higher
## -Inf 10
Summarize the number of cells removed for each reason. In this example, all cells passed the thresholds and none are considered an outlier.
discard <- qc.lib | qc.expr | qc.mito
colData(sce)$discard <- discard # Add a column in colData to store QC filter result
DataFrame(LibSize = sum(qc.lib), ExprGene = sum(qc.expr), MitoProp = sum(qc.mito),
Total = sum(discard))## DataFrame with 1 row and 4 columns
## LibSize ExprGene MitoProp Total
## <integer> <integer> <integer> <integer>
## 1 0 0 0 0
# Show as Venn diagram
limma::vennDiagram(data.frame(LibSize = qc.lib, ExprGene = qc.expr, MitoProp = qc.mito),
circle.col = circle.col)Now we are ready to inspect the distributions of various metrics and the thresholds chosen earlier. In an ideal case, we will see these metrics follow normal distributions and thus would justify the 3 MAD thresholds used in outlier detection. Afer assessing the plots, we can decide if the thresholds need adjustment to account for specific biological states or subset of cells, etc.
The cell counts are determined by the number of unique cellular barcodes detected. Ideally, the number of unique cellular barcodes will correpsond to the number of cells loaded. However the capture rates or capture efficiency of cells will affect the eventual cell counts. Accurate measure the input cell concentration is also important in determining the cell capture efficiency. Lastly, it is also possible to detect cell numbers that are much higher than what we loaded due to the experimental procedure. For example, there is a chance of obtaining only a barcoded bead in the emulsion droplet (GEM) and no actual cell with the 10X protocol.
Note: What were the expected cell counts in samples?
table(colData(sce)$Sample)##
## G_1 G_2 L_1 L_2 NP_1 NP_2 PI_1 PI_2
## 2915 3106 5906 3697 2249 2127 1500 4306
ggplot(as.data.frame(colData(sce)), aes(Sample, fill = Sample)) + geom_bar(color = "black") +
geom_text(stat = "count", aes(label = ..count..), vjust = -0.5, size = 5) +
scale_fill_manual(values = sample_col) + theme_classic(base_size = 12) +
theme(legend.position = "none") + ylab("Counts") + ggtitle("Cell counts")Next, we will consider the total number of RNA molecules detected per cell. The unique molecular identifier (UMI) counts should generally be above 500 (ref). Wells with few transcripts are likely to have been broken or failed to capture a cell, and should thus be removed. If UMI counts are between 500-1000 counts, it is usable but the cells probably should have been sequenced more deeply (ref).
View library size distributions with quantile.
Note: Are there any cells with low total UMI counts?
quantile(colData(sce)$sum, seq(0, 1, 0.1)) # 0% - 100% percentile## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90%
## 1724.0 2880.0 3704.0 4576.5 5359.0 6285.5 7434.0 8961.5 11088.0 15572.0
## 100%
## 116555.0
quantile(colData(sce)$sum, seq(0.9, 1, 0.1)) # 90% - 100% percentile (high read-depth)## 90% 100%
## 15572 116555
Visualise library size distributions with ggplot.
logbreak = scales::trans_breaks("log10", function(x) 10^x)
loglab = scales::trans_format("log10", scales::math_format(10^.x))
# histogram
p1 <- ggplot(as.data.frame(colData(sce)), aes(sum, color = Sample, fill = Sample)) +
geom_histogram(position = "identity", alpha = 0.4, bins = 50) +
scale_x_log10(breaks = logbreak, labels = loglab) +
geom_vline(xintercept = attr(qc.lib, "thresholds")[1], linetype = 2) + # show cutoff
scale_fill_manual(values = sample_col) + scale_color_manual(values = sample_col) +
theme_classic(base_size = 12) + xlab("Total UMI counts") +
ylab("Frequency") + ggtitle("Histogram of Library Size per Cell")
# violin plot
p2 <- ggplot(as.data.frame(colData(sce)), aes(Sample, sum, colour = discard)) +
geom_violin(width = 1) + scale_y_log10(breaks = logbreak, labels = loglab) +
geom_quasirandom(size = 0.2, alpha = 0.2, width = 0.5) +
geom_hline(yintercept = attr(qc.lib, "thresholds")[1], linetype = 2) + # show cutoff
scale_color_manual(values = c("blue", "red")) + theme_classic(base_size = 12) +
guides(color = guide_legend("Discard", override.aes = list(size = 1, alpha = 1))) +
theme(legend.position = "right") + ylab("Total UMI counts") +
ggtitle("Violin plot of Library Size perl Cell")
multiplot(p1, p2)Aside from having sufficient sequencing depth for each sample, we also expected to see reads distributed across the transcriptome. When visualised the expressed features (i.e. genes) in all the cells as a histogram or density plot, the plot should contain a single large peak for high quality data.
View expressed features distributions with quantile.
quantile(colData(sce)$detected, seq(0, 1, 0.1))## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 604 1223 1473 1731 1948 2167 2407 2710 3122 3786 8795
Visualise expressed features distributions with ggplot.
# histogram
p1 <- ggplot(as.data.frame(colData(sce)), aes(detected, color = Sample, fill = Sample)) +
geom_histogram(position = "identity", alpha = 0.4, bins = 50) +
scale_x_log10(breaks = logbreak, labels = loglab) +
geom_vline(xintercept = attr(qc.expr, "thresholds")[1], linetype = 2) + # show cutoff
scale_fill_manual(values = sample_col) + scale_color_manual(values = sample_col) +
theme_classic(base_size = 12) + xlab("Total expressed genes") +
ylab("Frequency") + ggtitle("Histogram of Expressed Features per Cell")
# violin
p2 <- ggplot(as.data.frame(colData(sce)), aes(Sample, detected, colour = discard)) +
geom_violin(width = 1) + scale_y_log10(breaks = logbreak, labels = loglab) +
geom_quasirandom(size = 0.2, alpha = 0.2, width = 0.5) +
geom_hline(yintercept = attr(qc.expr, "thresholds")[1], linetype = 2) + # show cutoff
scale_colour_manual(values = c("blue", "red")) + theme_classic(base_size = 12) +
guides(color = guide_legend("Discard", override.aes = list(size = 1, alpha = 1))) +
theme(legend.position = "right") + ylab("Total expressed genes") +
ggtitle("Violin plot of Expressed Features per Cell")
multiplot(p1, p2)The UMI count per cell and the number of genes detected per cell are often evaluated together. These two indices are usually strongly related, i.e. the higher UMI count for a cell, the more genes are detected as well. Cells that have a less complex RNA species (low number of genes detected per UMI), such as red blood cells, can often be detected by this metric. Generally, we expect the complexity score to be above 0.80 ref.
View complexity distributions with quantile.
quantile(colData(sce)$log10_genes_per_umi, seq(0, 1, 0.1))## 0% 10% 20% 30% 40% 50% 60% 70% 80%
## 0.7401947 0.8502186 0.8601689 0.8671018 0.8726119 0.8775765 0.8819169 0.8863240 0.8910505
## 90% 100%
## 0.8968736 0.9385222
Visualise complexity distributions with ggplot.
# histogram
p1 <- ggplot(as.data.frame(colData(sce)), aes(log10_genes_per_umi, color = Sample,
fill = Sample)) +
geom_histogram(position = "identity", alpha = 0.4, bins = 50) +
geom_vline(xintercept = 0.8, linetype = 2) +
scale_fill_manual(values = sample_col) + scale_color_manual(values = sample_col) +
theme_classic(base_size = 12) + xlab("log10 Gene per UMI") + ylab("Frequency") +
ggtitle("Histogram of Complexity of Gene Expression")
# scatter plot
p2 <- ggplot(as.data.frame(colData(sce)), aes(sum, detected, color = Sample)) +
scale_x_log10(breaks = logbreak, labels = loglab) +
scale_y_log10(breaks = logbreak, labels = loglab) +
geom_point(size = 0.6, alpha = 0.3) + facet_wrap(~ as.data.frame(colData(sce))$Sample) +
scale_color_manual(values = sample_col) + theme_classic(base_size = 12) +
guides(color = guide_legend("Sample", override.aes = list(size = 3, alpha = 1))) +
xlab("Total UMI counts") + ylab("Total expressed genes") +
ggtitle("UMIs vs. Expressed Genes")
multiplot(p1, p2)The mitochondrial proportions in cells is an useful indicator of cell quality. High proportion of counts assigned to mitochondrial genes is an indication of damaged, dying and dead cells, whereby cytoplasmic mRNA has leaked out through a broken membrane, hence only the mRNA located in the mitochondria is preserved and being sequenced.
View distributions with quantile.
Note: Are there any cells with high expression of mitochondrial genes (>20% of total counts in a cell)?
quantile(colData(sce)$subsets_MT_percent, seq(0, 1, 0.1))## 0% 10% 20% 30% 40% 50% 60% 70% 80%
## 0.0000000 0.4207468 0.5272408 0.6069388 0.6817726 0.7510203 0.8267616 0.9212001 1.0394179
## 90% 100%
## 1.2505218 8.2174054
Visualise distributions with ggplot.
# histogram
p1 <- ggplot(as.data.frame(colData(sce)), aes(x = subsets_MT_percent, color = Sample,
fill = Sample)) +
geom_histogram(position = "identity", alpha = 0.4, bins = 50) +
geom_vline(xintercept = attr(qc.mito, "thresholds")[2], linetype = 2) + # show cutoff
scale_fill_manual(values = sample_col) + scale_color_manual(values = sample_col) +
theme_classic(base_size = 12) + xlab("% counts from mitochondrial genes") +
ylab("Frequency") + ggtitle("Histogram of Mitochondrial Proportions per Cell")
# violin
p2 <- ggplot(as.data.frame(colData(sce)), aes(Sample, subsets_MT_percent, colour = discard)) +
geom_violin(width = 1) + geom_quasirandom(size = 0.2, alpha = 0.2, width = 0.5) +
geom_hline(yintercept = attr(qc.mito, "thresholds")[2], linetype = 2) + # show cutoff
scale_colour_manual(values = c("blue", "red")) + theme_classic(base_size = 12) +
guides(color = guide_legend("Discard", override.aes = list(size = 1, alpha = 1))) +
theme(legend.position = "right") + ylab("% counts from mitochondrial genes") +
ggtitle("Violin plot of Mitochondrial Proportions per Cell")
multiplot(p1, p2)After reviewing the diagnostic plots, we will proceed with removing low-quality cells from sce with the thresholds that were selected in Identify low-quality cells. You can also change the thresholds that is suitable for your study to prevent removing biologically relevant cells.
In this example, none of the cells are outliers, and hence none were removed.
sce.filt <- sce[, !discard]
sce.filt## class: SingleCellExperiment
## dim: 27998 25806
## metadata(0):
## assays(1): counts
## rownames: NULL
## rowData names(4): Ensembl Symbol mean detected
## colnames: NULL
## colData names(17): Barcode Sample ... log10_genes_per_umi discard
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):
Because poor-quality cells were removed, we will update the per-feature QC metrics whereby some calculations uses cell data.
rowData(sce.filt) <- cbind(rowData(sce.filt)[,c("Ensembl", "Symbol")],
perFeatureQCMetrics(sce.filt))
# Add additional stats to per-feature QC
rowData(sce.filt)$n_cells <- rowData(sce.filt)$detected/100 * ncol(sce.filt) # number of expressing cells
rowData(sce.filt)$pct_dropout <- 100 - rowData(sce.filt)$detected # percentage of cells with zero counts
rowData(sce.filt)$total_counts <- rowData(sce.filt)$mean * ncol(sce.filt)Show number of genes that are not expressed in any cell.
not.expressed <- rowData(sce.filt)$n_cells == 0
table(not.expressed)## not.expressed
## FALSE TRUE
## 21140 6858
We remove genes that are not expressed in any cell
sce.filt <- sce.filt[!not.expressed,]
sce.filt## class: SingleCellExperiment
## dim: 21140 25806
## metadata(0):
## assays(1): counts
## rownames: NULL
## rowData names(7): Ensembl Symbol ... pct_dropout total_counts
## colnames: NULL
## colData names(17): Barcode Sample ... log10_genes_per_umi discard
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):
Aside from the 6858 genes that are not expressed in any cell, there are also genes that have extremely low average expression across all cells. We can also observe a positive relationship between the number of of expressing cells versus mean counts of all the genes in the expression matrix.
# histogram
p1 <- ggplot(as.data.frame(rowData(sce.filt)), aes(x = mean)) +
geom_histogram(position = "identity", bins = 100, fill = "white", color = "black") +
scale_x_log10(breaks = logbreak, labels = loglab) +
geom_vline(xintercept = 0.005, linetype = 2, color = "cyan") + # mean count = 0.005
geom_vline(xintercept = 0.010, linetype = 2, color = "blue") + # mean count = 0.010
geom_vline(xintercept = 0.020, linetype = 2, color = "red") + # mean count = 0.020
theme_classic(base_size = 12) + xlab("Mean counts") +
ylab("Frequency") + ggtitle("Histogram of Mean Counts")
# scatter plot
p2 <- ggplot(as.data.frame(rowData(sce.filt)), aes(n_cells, mean, color = pct_dropout)) +
geom_point(size = 0.6, alpha = 0.3) + theme_classic(base_size = 12) +
scale_x_log10(breaks = logbreak, labels = loglab) +
scale_y_log10(breaks = logbreak, labels = loglab) +
geom_hline(yintercept = 0.005, linetype = 2, color = "cyan") +
geom_hline(yintercept = 0.010, linetype = 2, color = "blue") +
geom_hline(yintercept = 0.020, linetype = 2, color = "red") +
guides(color = guide_legend("Pct Dropouts", override.aes = list(size = 3, alpha = 1))) +
xlab("Number of expressing cells") + ylab("Mean counts") +
ggtitle("Expressing Cells vs. Mean Counts")
multiplot(p1, p2)After assessing the QC metrics on features, we can proceeed to remove genes that is considered “undetectable”. This can be done by define a gene as detectable if the average expression is above a certain threshold, or a N number of cells expressing X number of the transcript.
# Average expression more than 0.02
qc.gene1 <- rowData(sce.filt)$mean > 0.02
# 2 transcript detected in at least 0.1% of total cells
ncell <- ncol(sce.filt) * 0.001
ncell## [1] 25.806
qc.gene2 <- nexprs(sce.filt, byrow = TRUE, detection_limit = 2) >= ncell
x = data.frame(n_cells = rowData(sce.filt)$n_cells, mean = rowData(sce.filt)$mean,
qc1 = qc.gene1, qc2 = qc.gene2)
x$cond = paste0(x$qc1, x$qc2)
x$cond = as.factor(x$cond)
levels(x$cond) = c("Failed (Both)", "Failed (mean)", "Failed (nexprs)", "Pass")
table(x$cond)##
## Failed (Both) Failed (mean) Failed (nexprs) Pass
## 9605 82 3191 8262
# Show as Venn diagram
limma::vennDiagram(data.frame(MeanExpr = qc.gene1, kOverA = qc.gene2),
circle.col = c("blue", "red"), main = "Detectable Genes")# scatter plots
p3 <- ggplot(x, aes(n_cells, mean, color = cond)) +
geom_point(size = 0.6, alpha = 0.3) + theme_classic(base_size = 12) +
scale_x_log10(breaks = logbreak, labels = loglab) +
scale_y_log10(breaks = logbreak, labels = loglab) +
geom_hline(yintercept = 0.02, linetype = 2, color = "blue") + # mean count = 0.005
guides(color = guide_legend("Feature Status", override.aes = list(size = 3, alpha = 1))) +
xlab("Number of expressing cells") + ylab("Mean counts") +
ggtitle("Expressing Cells vs. Mean Counts")
p4 <- ggplot(x, aes(n_cells, mean, color = cond)) + geom_point(size = 0.6, alpha = 0.3) +
facet_wrap(~ cond) + theme_classic(base_size = 12) +
scale_x_log10(breaks = logbreak, labels = loglab) +
scale_y_log10(breaks = logbreak, labels = loglab) +
geom_hline(yintercept = 0.02, linetype = 2, color = "blue") + # mean count = 0.005
guides(color = guide_legend("Feature Status", override.aes = list(size = 3, alpha = 1))) +
xlab("Number of expressing cells") + ylab("Mean counts") +
ggtitle("Expressing Cells vs. Mean Counts")
multiplot(p3, p4)Here we will use the classical kOverA approach to select genes that can be detected (2 transcripts) in at least 0.1% of total cells.
sce.filt <- sce.filt[qc.gene2,]
sce.filt## class: SingleCellExperiment
## dim: 8344 25806
## metadata(0):
## assays(1): counts
## rownames: NULL
## rowData names(7): Ensembl Symbol ... pct_dropout total_counts
## colnames: NULL
## colData names(17): Barcode Sample ... log10_genes_per_umi discard
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):
Because lowly-expressed genes were removed, we will update the per-cell QC metrics whereby some calculations uses gene data.
colData(sce.filt) <- cbind(colData(sce.filt)[,c("Barcode","Sample","Condition")],
perCellQCMetrics(sce.filt, list(MT = grep("^mt-|^MT-", rowData(sce.filt)$Symbol))))Use the calculateAverage function to average counts per feature after normalizing with size factors.
rowData(sce.filt)$ave_counts <- calculateAverage(sce.filt)
r.squared = summary(lm(mean ~ ave_counts, rowData(sce.filt)))$r.squared
r.squared = as.expression(bquote(R^2 == .(round(r.squared, 4))))
ggplot(as.data.frame(rowData(sce.filt)), aes(mean, ave_counts)) +
geom_point(size = 0.6, alpha = 0.3) + theme_classic(base_size = 12) +
scale_x_log10(breaks = logbreak, labels = loglab) +
scale_y_log10(breaks = logbreak, labels = loglab) +
geom_smooth(method = "lm", se = FALSE) +
annotate("text", label = r.squared, x = 0.1, y = 100, size = 4) +
xlab("Mean counts") + ylab("Size-adjusted average count") +
ggtitle("Mean Counts vs. Size-adjusted Average Count")sce.filtIn addition to the count data in assays, we will also add the log2-transformed counts to assays.
# Accessing the assay data
dim(assay(sce.filt, "counts"))## [1] 8344 25806
assay(sce.filt, "logcounts") <- log2(counts(sce.filt) + pseudocount)The effective library sizes are used as the denominator of the CPM calculation.
# More about lib.sizes calculation in calculateCPM()
# x is a numeric matrix of counts
x <- assay(sce, "counts")
size.factors <- colSums(x)/mean(colSums(x)) # same as scater::librarySizeFactors(x)
lib.sizes <- colSums(x) / 1e6 # count-per-million
lib.sizes <- size.factors / mean(size.factors) * mean(lib.sizes) # normalisation
assay(sce.filt, "logcpm") <- log2(calculateCPM(sce.filt) + pseudocount)# Assay info
str(assays(sce.filt))## Formal class 'SimpleList' [package "S4Vectors"] with 4 slots
## ..@ listData :List of 3
## .. ..$ counts :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
## .. .. .. ..@ i : int [1:56501178] 5 7 8 9 11 12 13 16 17 21 ...
## .. .. .. ..@ p : int [1:25807] 0 3424 6458 9443 12904 16161 17983 20490 23122 25199 ...
## .. .. .. ..@ Dim : int [1:2] 8344 25806
## .. .. .. ..@ Dimnames:List of 2
## .. .. .. .. ..$ : NULL
## .. .. .. .. ..$ : NULL
## .. .. .. ..@ x : num [1:56501178] 1 2 6 1 1 4 1 1 1 1 ...
## .. .. .. ..@ factors : list()
## .. ..$ logcounts:Formal class 'dgeMatrix' [package "Matrix"] with 4 slots
## .. .. .. ..@ x : num [1:215325264] 0 0 0 0 0 ...
## .. .. .. ..@ Dim : int [1:2] 8344 25806
## .. .. .. ..@ Dimnames:List of 2
## .. .. .. .. ..$ : NULL
## .. .. .. .. ..$ : NULL
## .. .. .. ..@ factors : list()
## .. ..$ logcpm :Formal class 'dgeMatrix' [package "Matrix"] with 4 slots
## .. .. .. ..@ x : num [1:215325264] 0 0 0 0 0 ...
## .. .. .. ..@ Dim : int [1:2] 8344 25806
## .. .. .. ..@ Dimnames:List of 2
## .. .. .. .. ..$ : NULL
## .. .. .. .. ..$ : NULL
## .. .. .. ..@ factors : list()
## ..@ elementType : chr "ANY"
## ..@ elementMetadata: NULL
## ..@ metadata : list()
save.image("BachMammary.RData")sessionInfo()## R version 3.6.3 (2020-02-29)
## Platform: x86_64-conda_cos6-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
##
## Matrix products: default
## BLAS/LAPACK: /home/ihsuan/miniconda3/envs/jupyterlab/lib/libopenblasp-r0.3.10.so
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C LC_TIME=en_GB.UTF-8
## [4] LC_COLLATE=en_GB.UTF-8 LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets methods
## [9] base
##
## other attached packages:
## [1] ggbeeswarm_0.6.0 scater_1.14.6 ggplot2_3.3.2
## [4] scRNAseq_2.0.2 SingleCellExperiment_1.8.0 SummarizedExperiment_1.16.1
## [7] DelayedArray_0.12.3 BiocParallel_1.20.1 matrixStats_0.56.0
## [10] Biobase_2.46.0 GenomicRanges_1.38.0 GenomeInfoDb_1.22.1
## [13] IRanges_2.20.2 S4Vectors_0.24.4 BiocGenerics_0.32.0
## [16] knitr_1.29
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-149 bitops_1.0-6
## [3] bit64_4.0.5 httr_1.4.2
## [5] tools_3.6.3 R6_2.4.1
## [7] irlba_2.3.3 vipor_0.4.5
## [9] DBI_1.1.0 mgcv_1.8-33
## [11] colorspace_1.4-1 withr_2.2.0
## [13] tidyselect_1.1.0 gridExtra_2.3
## [15] bit_4.0.4 curl_4.3
## [17] compiler_3.6.3 BiocNeighbors_1.4.2
## [19] labeling_0.3 scales_1.1.1
## [21] rappdirs_0.3.1 stringr_1.4.0
## [23] digest_0.6.25 rmarkdown_2.3
## [25] XVector_0.26.0 pkgconfig_2.0.3
## [27] htmltools_0.5.0 dbplyr_1.4.4
## [29] fastmap_1.0.1 limma_3.42.2
## [31] rlang_0.4.7 RSQLite_2.2.0
## [33] shiny_1.5.0 DelayedMatrixStats_1.8.0
## [35] generics_0.0.2 farver_2.0.3
## [37] dplyr_1.0.2 RCurl_1.98-1.2
## [39] magrittr_1.5 BiocSingular_1.2.2
## [41] GenomeInfoDbData_1.2.2 Matrix_1.2-18
## [43] Rcpp_1.0.5 munsell_0.5.0
## [45] viridis_0.5.1 lifecycle_0.2.0
## [47] stringi_1.4.6 yaml_2.2.1
## [49] zlibbioc_1.32.0 BiocFileCache_1.10.2
## [51] AnnotationHub_2.18.0 grid_3.6.3
## [53] blob_1.2.1 promises_1.1.1
## [55] ExperimentHub_1.12.0 crayon_1.3.4
## [57] lattice_0.20-41 splines_3.6.3
## [59] pillar_1.4.6 glue_1.4.2
## [61] BiocVersion_3.10.1 evaluate_0.14
## [63] BiocManager_1.30.10 vctrs_0.3.4
## [65] httpuv_1.5.4 gtable_0.3.0
## [67] purrr_0.3.4 assertthat_0.2.1
## [69] xfun_0.16 rsvd_1.0.3
## [71] mime_0.9 xtable_1.8-4
## [73] later_1.1.0.1 viridisLite_0.3.0
## [75] tibble_3.0.3 AnnotationDbi_1.48.0
## [77] beeswarm_0.2.3 memoise_1.1.0
## [79] ellipsis_0.3.1 interactiveDisplayBase_1.24.0